library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ dplyr   1.0.2
## ✓ tibble  3.0.4     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom     0.7.2      ✓ recipes   0.1.15
## ✓ dials     0.0.9      ✓ rsample   0.0.8 
## ✓ infer     0.5.3      ✓ tune      0.1.2 
## ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
## ✓ parsnip   0.1.4      ✓ yardstick 0.0.7
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
library(cowplot)
winequality_red <- read_csv("winequality-red.csv") 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_double()
## )
winequality_red <- winequality_red %>% mutate(perc_sulfur_dioxide = `free sulfur dioxide`/`total sulfur dioxide`) %>% select(-`free sulfur dioxide`, - `total sulfur dioxide`) %>% 
  mutate(alcohol_level = case_when(alcohol < 7 ~ "< 7 %"
                                   ,alcohol > 7 & alcohol <= 9 ~ "7 - 9 %"
                                   ,alcohol > 9 & alcohol <= 11 ~ "9 - 11 %"
                                   ,alcohol > 11 ~ "> 11 %")) %>% 
  mutate(quality_level = case_when(quality == 3 | quality == 4 ~ "low_quality"
                                   ,quality == 5 | quality == 6 ~ "medium_quality"
                                   ,quality == 7 | quality == 8 ~ "high_quality")) %>% mutate(quality = as.factor(quality))

EDA

cormat <- winequality_red %>% select_if(is.numeric) %>% cor()
cormat <- round(cormat ,2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
melted_cormat <- melt(cormat)


ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill = value))+
 geom_tile(color = "white")+
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) 

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
winequality_red %>% filter(quality_level %in% c("high_quality", "low_quality")) %>% plot_ly(x = .$pH, y = .$alcohol, z = .$`residual sugar`, type="scatter3d", mode="markers", color = .$quality_level)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

PCA

wine_pca <- winequality_red %>% select(where(is.numeric)) %>% # retain only numeric columns
  prcomp(scale = TRUE) 
wine_pca %>%
  augment(winequality_red) %>% # add original dataset back in
  ggplot(aes(.fittedPC1, .fittedPC2, color = quality)) + 
  geom_point(size = 1.5)

wine_pca %>%
  tidy(matrix = "eigenvalues") %>%
  ggplot(aes(PC, percent)) +
  geom_col(fill = "#56B4E9", alpha = 0.8) +
  scale_x_continuous(breaks = 1:9) +
  scale_y_continuous(
    labels = scales::percent_format(),
    expand = expansion(mult = c(0, 0.01))
  )

# define arrow style for plotting
arrow_style <- arrow(
  angle = 20, ends = "first", type = "closed", length = grid::unit(8, "pt")
)

# plot rotation matrix
wine_pca %>%
  tidy(matrix = "rotation") %>%
  pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>%
  ggplot(aes(PC1, PC2)) +
  geom_segment(xend = 0, yend = 0, arrow = arrow_style) +
  geom_text(
    aes(label = column),
    hjust = 1, nudge_x = -0.02, 
    color = "#904C2F"
  ) +
  xlim(-1.25, .5) + ylim(-.5, 1) +
  coord_fixed() + # fix aspect ratio to 1:1
  theme_minimal_grid(12)
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_text).

points <- winequality_red %>% select(where(is.numeric))

kclusts <- 
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~kmeans(points, .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, points)
  )
clusters <- 
  kclusts %>%
  unnest(cols = c(tidied))

assignments <- 
  kclusts %>% 
  unnest(cols = c(augmented))

clusterings <- 
  kclusts %>%
  unnest(cols = c(glanced))
p1 <- 
  ggplot(assignments, aes(x = assignments$pH, y = assignments$chlorides)) +
  geom_point(aes(color = .cluster), alpha = 0.8) + 
  facet_wrap(~ k)
p1
## Warning: Use of `assignments$pH` is discouraged. Use `pH` instead.
## Warning: Use of `assignments$chlorides` is discouraged. Use `chlorides` instead.

Tidy way

library(tidymodels)

winequality_red_tidy <- winequality_red %>% select(where(is.numeric), quality)

pca_wine <- recipe( quality ~., data = winequality_red_tidy) %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_wine)

pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         10
## 
## Training data contained 1599 data points and no missing data.
## 
## Operations:
## 
## Centering and scaling for fixed acidity, volatile acidity, ... [trained]
## PCA extraction with fixed acidity, volatile acidity, ... [trained]
tidied_pca <- tidy(pca_prep, 2)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

library(tidytext)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

juice(pca_prep) %>%
  ggplot(aes(PC1, PC2, label = quality)) +
  geom_point(aes(color = quality), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

umap

library(embed)

umap_rec <- recipe(quality ~., data = winequality_red_tidy) %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_prep <- prep(umap_rec)

umap_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         10
## 
## Training data contained 1599 data points and no missing data.
## 
## Operations:
## 
## Centering and scaling for fixed acidity, volatile acidity, ... [trained]
## UMAP embedding for fixed acidity, volatile acidity, ... [trained]
juice(umap_prep) %>%
  ggplot(aes(umap_1, umap_2, label = quality)) +
  geom_point(aes(color = quality), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

autoencoders

rw <- read_csv("winequality-red.csv") %>% mutate(quality = as.factor(quality))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_double()
## )
set.seed(1353)
wine_split <- initial_split(rw)
train_data <- training(wine_split)
test_data <- testing(wine_split)
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 days 14 hours 
##     H2O cluster timezone:       Europe/Berlin 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.0.1 
##     H2O cluster version age:    2 months and 6 days  
##     H2O cluster name:           H2O_started_from_R_Herdt_iaq382 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.98 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.2 (2020-06-22)
ae_model <- h2o.deeplearning(x = 1:11, training_frame = as.h2o(train_data),
autoencoder = TRUE,
reproducible = TRUE,
 seed = 148,
 hidden = c(10,10,10), epochs = 100,activation = "Tanh",
validation_frame = as.h2o(test_data))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |======================================================================| 100%
h2o.scoreHistory(ae_model)%>%head()
## Scoring History: 
##             timestamp   duration  training_speed  epochs iterations     samples
## 1 2020-12-15 08:07:47  0.033 sec 0,00000 obs/sec 0.00000          0    0.000000
## 2 2020-12-15 08:07:47  0.082 sec   27272 obs/sec 1.00000          1 1200.000000
## 3 2020-12-15 08:07:47  0.118 sec   32000 obs/sec 2.00000          2 2400.000000
## 4 2020-12-15 08:07:47  0.156 sec   33333 obs/sec 3.00000          3 3600.000000
## 5 2020-12-15 08:07:47  0.193 sec   34782 obs/sec 4.00000          4 4800.000000
## 6 2020-12-15 08:07:47  0.234 sec   34682 obs/sec 5.00000          5 6000.000000
##   training_rmse training_mse validation_rmse validation_mse
## 1       0.16596      0.02754         0.16777        0.02815
## 2       0.07281      0.00530         0.07075        0.00501
## 3       0.06205      0.00385         0.06009        0.00361
## 4       0.05521      0.00305         0.05307        0.00282
## 5       0.05035      0.00254         0.04859        0.00236
## 6       0.04493      0.00202         0.04328        0.00187
test_autoencoder <- h2o.predict(ae_model,  as.h2o(test_data))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
train_features <- h2o.deepfeatures(ae_model, as.h2o(train_data), layer = 2) %>%
  as.data.frame() %>%cbind(train_data %>% 
  mutate(quality_level = case_when(quality == 3 | quality == 4 ~ "low_quality"
                                   ,quality == 5 | quality == 6 ~ "medium_quality"
                                   ,quality == 7 | quality == 8 ~ "high_quality")) %>% select(quality_level))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
ggplot(train_features, aes(x = DF.L2.C1, y = DF.L2.C4, color = quality_level)) +
  geom_point(alpha = 0.9,size=1.5)+theme_bw()

train_features %>% plot_ly(x = .$DF.L2.C1, y = .$DF.L2.C2, z = .$DF.L2.C3, type="scatter3d", mode="markers", color = .$quality_level)

Anomaly detection

anomaly <- h2o.anomaly(ae_model, as.h2o(test_data)) %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%cbind(test_data %>% 
  mutate(Class = case_when(quality == 3 | quality == 4 ~ "low_quality"
                                   ,quality == 5 | quality == 6 ~ "medium_quality"
                                   ,quality == 7 | quality == 8 ~ "high_quality")) %>% select(Class))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
mean_mse <- anomaly %>%
  group_by(Class) %>%
  summarise(mean = mean(Reconstruction.MSE))
## `summarise()` ungrouping output (override with `.groups` argument)
anomaly<-anomaly%>%mutate_if(is.character,as.factor)
anomaly$rowname=as.numeric(anomaly$rowname)
anomaly%>%head()
##   rowname Reconstruction.MSE          Class
## 1       1       3.100766e-05 medium_quality
## 2     112       8.333021e-05 medium_quality
## 3     223       9.802102e-05 medium_quality
## 4     334       1.859427e-05 medium_quality
## 5     345       1.329144e-04 medium_quality
## 6     356       5.681355e-04 medium_quality
wine.anon = h2o.anomaly(ae_model, as.h2o(test_data), per_feature=FALSE)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
MSE<-wine.anon%>%as_tibble()
MSE$Index<-1:length(MSE$Reconstruction.MSE)
ggplot(MSE,aes(x=Index,y=sort(Reconstruction.MSE)))+geom_point()+ylab("Reconstruction.MSE")

anomaly%>%ggplot( aes(x = rowname, y = Reconstruction.MSE,color=Class)) +
  geom_point(alpha = 0.5) +
  #geom_hline(data = mean_mse, aes(yintercept = mean)) +
  geom_hline(yintercept =0.004,color="#3288BD") +
  
  labs(x = "instance number", color = "Class")

anomaly <- anomaly %>%
  mutate(outlier = ifelse(Reconstruction.MSE > 0.004 , "outlier", "no_outlier"))
anomaly %>%
  group_by(Class, outlier) %>%
 dplyr:: summarise(n = n()) %>%
  mutate(freq = n / sum(n)) 
## `summarise()` regrouping output by 'Class' (override with `.groups` argument)
## # A tibble: 5 x 4
## # Groups:   Class [3]
##   Class          outlier        n    freq
##   <fct>          <chr>      <int>   <dbl>
## 1 high_quality   no_outlier    44 1      
## 2 low_quality    no_outlier    14 0.933  
## 3 low_quality    outlier        1 0.0667 
## 4 medium_quality no_outlier   338 0.994  
## 5 medium_quality outlier        2 0.00588